library(tidyverse)
library(readxl)
library(Hmisc)

# generate output dir path named data
output.dir <- file.path("survey_data")

# if source output dir does not exist, create it
if (!dir.exists(output.dir)) {
  dir.create(output.dir)
} else{
  print("Output folder already exists")
}

# Load info on survey completion
survey_completion <- read_excel("survey_completion.xlsx")

# Change names of data frame columns
colnames(survey_completion)
colnames(survey_completion) <- c("Email", "Technique", "Version", "Status")

# Convert "Status" column to factor and check its values
survey_completion$Status <- survey_completion$Status %>% as.factor()
survey_completion %>% select(Status) %>% summary()

# Filter only completed surveys
completed_survey <- survey_completion %>% filter(Status == "Complete")


get_dfs <- function(folder){
  
  # List excel files within a folder
  file.list <- list.files(path = folder, pattern='*.xlsx')
  file.list <- paste(folder, file.list, sep = "")
  # Read these excel files and create a list of their data frames
  df.list <- sapply(file.list, read_excel, simplify = FALSE)
  
  return(df.list)
}


aggregate_dfs <- function(df.list) {
  # Get an empty data frame with the columns
  aggr_df = df.list[[1]] %>% filter(`Start Date` == 0)

  # For each data frame in the list, add data from completed surveys
  for(i in 1:length(df.list)){
    print(paste("Processing dataframe: ", i, sep = ""))
    df_f <- df.list[[i]] %>% filter(`Email Address` %in% completed_survey$Email)
    aggr_df <- rbind(aggr_df, df_f)
  }
  return(aggr_df)
}

# Get a list of data frames from excel files
pcr_dfs <- get_dfs("PCR/")
mtt_dfs <- get_dfs("MTT/")
epm_dfs <- get_dfs("EPM/")

# Aggregate data from excel files, filtering only completed surveys
pcr_df <- pcr_dfs %>% aggregate_dfs()
mtt_df <- mtt_dfs %>% aggregate_dfs()
epm_df <- epm_dfs %>% aggregate_dfs()

rm(pcr_dfs, mtt_dfs, epm_dfs)

completed_survey <- completed_survey %>% rename("Email Address" = "Email")

# Join information from technique and lab version
pcr_df <- pcr_df %>% left_join(completed_survey %>% filter(Technique == "PCR"))
mtt_df <- mtt_df %>% left_join(completed_survey %>% filter(Technique == "MTT"))
epm_df <- epm_df %>% left_join(completed_survey %>% filter(Technique == "EPM"))

# Remove the first row of pcr
# this participant enrolled twice but only one of the surveys has data
pcr_df <- pcr_df %>% filter(`Respondent ID` != "11186105687")

# Export these aggregated data frames
pcr_df %>% write.csv(file.path(output.dir, "pcr_surveys.csv"), row.names = FALSE)
mtt_df %>% write.csv(file.path(output.dir, "mtt_surveys.csv"), row.names = FALSE)
epm_df %>% write.csv(file.path(output.dir, "epm_surveys.csv"), row.names = FALSE)

# Abbreviate questions, as they are too large to use as column names

# Get a data frame with all 7 questions for each study (from corresponding column names)
study_questions <- pcr_df %>% select(c(57:63))
study_questions <- colnames(study_questions) %>% as.data.frame()

colnames(study_questions) <- "Original"

# Add abbreviated codes for questions and export this data frame
question_codes <- c("Replication_probability","Replication_factors",
                   "Replication_rel_effect_size","Effect_size_factors",
                   "Difficulty", "Difficulty_factors", "Used_paper")
study_questions$abbreviated <- question_codes

study_questions %>% write.csv("Question_abreviations.csv", row.names = FALSE)

# Data from each study is spread as columns, with 7 questions each. 
# The questions are the same but the column number is added to the end
# This should be parsed to a long format, using identical codes for questions 
# as column names, and indicating which study it comes from

parse_study_questions <- function(df, start_col, ncols) {
  # create an empty data.frame to append study data
  parsed_df = data.frame(matrix(ncol = 11, nrow = 0))
  parsed_colnames <- c("Email Address", question_codes, "Version", "Technique", "Study")
  colnames(parsed_df) <- parsed_colnames
  
  # Starting column where questions start for each study
  current_col = start_col
  for (i in 1:20){
    print(paste("Processing study: ", i, sep = ""))
    print(paste("Corresponding start column: ", current_col, sep = ""))
    
    # Select the desired columns, study questions and identifier
    study_df <- df %>% 
      select(`Email Address`, c(current_col:(current_col + ncols - 1)), Technique, Version)
    
    # Rename columns using standard question codes 
    colnames(study_df) <- c("Email Address", question_codes, "Technique", "Version")
    
    # Add study number column and bind to larger data frame
    study_df <- study_df %>% mutate(Study = i)
    parsed_df <- rbind(parsed_df, study_df)
    
    current_col = current_col + ncols
  }
  
  # Get other Participant Information and join by email
  participant_cols = c(6,20:56)
  participant_cols_data <- df %>% select(participant_cols)
  parsed_df <- parsed_df %>% left_join(participant_cols_data, by = "Email Address")
  rm(participant_cols, participant_cols_data)
  
  return(parsed_df)
}

parsed_pcr <- parse_study_questions(pcr_df, 57, 7)
parsed_mtt <- parse_study_questions(mtt_df, 57, 7)
parsed_epm <- parse_study_questions(epm_df, 57, 7)


# Convert to the appropriate column types
parsed_pcr$Replication_probability <- parsed_pcr$Replication_probability %>% as.numeric()
parsed_pcr$Replication_rel_effect_size <- parsed_pcr$Replication_rel_effect_size %>% as.numeric()
parsed_pcr$Difficulty <- parsed_pcr$Difficulty %>% as.numeric()
parsed_pcr$Study <- parsed_pcr$Study  %>% as.factor()
  
parsed_mtt$Replication_probability <- parsed_mtt$Replication_probability %>% as.numeric()
parsed_mtt$Replication_rel_effect_size <- parsed_mtt$Replication_rel_effect_size %>% as.numeric()
parsed_mtt$Difficulty <- parsed_mtt$Difficulty %>% as.numeric()
parsed_mtt$Study <- parsed_mtt$Study  %>% as.factor()

parsed_epm$Replication_probability <- parsed_epm$Replication_probability %>% as.numeric()
parsed_epm$Replication_rel_effect_size <- parsed_epm$Replication_rel_effect_size %>% as.numeric()
parsed_epm$Difficulty <- parsed_epm$Difficulty %>% as.numeric()
parsed_epm$Study <- parsed_epm$Study  %>% as.factor()
  
# Remove specific data from studies in lab versions, which were blind to their own experiments.
parsed_epm$Version %>% unique()
parsed_mtt$Version %>% unique()
parsed_pcr$Version %>% unique()

# Check which studies should be removed from which versions
labversion_masks <- read.csv2("Lab_version_surveymasks.csv")

regexp <- "[[:digit:]]+"
labversion_masks <- labversion_masks %>% pivot_longer(
  cols = -c(Version, Technique), names_to = "Study", values_to = "Value") %>% 
  filter(Value == "Remove") %>% mutate(Study = str_extract(Study, regexp))

# Remove specific studies from lab versions of surveys
parsed_epm_f <- parsed_epm
for (labversion in unique(parsed_epm$Version)){
  print(labversion)
  parsed_epm_f <- parsed_epm_f %>% filter(
    !(Version == labversion & 
        Study %in% (labversion_masks %>% filter(Version == labversion))$Study)
    )
}

parsed_pcr_f <- parsed_pcr
for (labversion in unique(parsed_pcr$Version)){
  parsed_pcr_f <- parsed_pcr_f %>% filter(
    !(Version == labversion & 
        Study %in% (labversion_masks %>% filter(Version == labversion))$Study)
  )
}

parsed_mtt_f <- parsed_mtt
for (labversion in unique(parsed_mtt$Version)){
  parsed_mtt_f <- parsed_mtt_f %>% filter(
    !(Version == labversion & 
        Study %in% (labversion_masks %>% filter(Version == labversion))$Study)
  )
}

# Convert emails to factor values and use them as anonymous participant IDs
parsed_pcr_f$`Email Address` <- as.factor(parsed_pcr_f$`Email Address`)
parsed_epm_f$`Email Address` <- as.factor(parsed_epm_f$`Email Address`)
parsed_mtt_f$`Email Address` <- as.factor(parsed_mtt_f$`Email Address`)

parsed_pcr_f <- parsed_pcr_f %>% 
  mutate(AnonymousID = paste0("PCR_P", as.numeric(parsed_pcr_f$`Email Address`)))

parsed_epm_f <- parsed_epm_f %>% 
  mutate(AnonymousID = paste0("EPM_P", as.numeric(parsed_epm_f$`Email Address`)))

parsed_mtt_f <- parsed_mtt_f %>% 
  mutate(AnonymousID = paste0("MTT_P", as.numeric(parsed_mtt_f$`Email Address`)))

# Export
parsed_pcr_f %>% select(-`Email Address`) %>% 
  write.csv2(file.path(output.dir, "parsed_pcr_surveys.csv"), row.names = FALSE)

parsed_epm_f %>% select(-`Email Address`) %>% 
  write.csv2(file.path(output.dir, "parsed_epm_surveys.csv"), row.names = FALSE)

parsed_mtt_f %>% select(-`Email Address`) %>% 
  write_csv2(file.path(output.dir, "parsed_mtt_surveys.csv"))

# Edit colnames that were different between versions
names(parsed_mtt_f)[c(46,47)] <- c("How would you rate your theoretical knowledge of the technique?", "How much practical experience do you have with the technique?")
names(parsed_pcr_f)[c(46,47)] <- c("How would you rate your theoretical knowledge of the technique?", "How much practical experience do you have with the technique?")
names(parsed_epm_f)[c(43,46,47,48)] <- c("Are you a member of the Brazilian Reproducibility Initiative replication team?", "How would you rate your theoretical knowledge of the technique?", "How much practical experience do you have with the technique?", 
                                         "How would you rate your knowledge on basic statistics and research methodology")

# Join all surveys and export
joined_surveys <- parsed_pcr_f %>% rbind(parsed_epm_f) %>% rbind(parsed_mtt_f)

joined_surveys <- joined_surveys %>% select(-`Email Address`) 

joined_surveys %>% write.csv2(file.path(output.dir, "parsed_survey_data.csv"), row.names = FALSE)


# Simplify all colnames
# colnames(joined_surveys)
joined_surveys <- read.csv2(file = file.path("survey_data", "parsed_survey_data.csv"))

joined_surveys <- joined_surveys %>% dplyr::rename(
  Institutional_position = What.is.your.position.in.this.institution.,
  Institutional_position_other = "...21", 
  Years_in_position = For.how.many.years.have.you.been.in.this.position.,
  Highest_title = What.is.the.highest.academic.title.you.have.obtained.,
  Highest_title_other = "...24",
  Knowledge_area1 = What.is.are.the.main.area.s..of.knowledge.of.your.degrees.,
  Knowledge_area2 = "...26",
  Knowledge_area3 = "...27",
  Knowledge_area4 = "...28",
  Knowledge_area5 = "...29",
  Knowledge_area6 = "...30",
  Knowledge_area7 = "...30",
  Knowledge_area8 = "...31",
  Knowledge_area9 = "...32",
  Knowledge_area10 = "...33",
  Knowledge_area11 = "...34",
  Knowledge_area12 = "...35",
  Knowledge_area13 = "...36",
  Years_in_research = During.your.life..for.how.many.years.have.you.been.involved.in.experimental.research., 
  Research_area1 = In.what.areas.of.knowledge.have.you.been.involved.in.experimental.research.,
  Research_area2 = "...39",
  Research_area3 = "...40",
  Research_area4 = "...41",
  Research_area5 = "...42",
  Research_area6 = "...43",
  Research_area7 = "...44",
  Research_area8 = "...45",
  Research_area9 = "...46",
  Research_area10 = "...47",
  Research_area11 = "...48",
  Research_area12 = "...49",
  Main_research_topic = What.are.the.main.research.topics.that.you.have.worked.on.within.this.these.area.s..,
  BRI_member = Are.you.a.member.of.the.Brazilian.Reproducibility.Initiative.replication.team.,
  sReplication_concept = What.is.your.concept.of.a.successful.direct.replication.of.a.published.finding.,
  General_rep_rate = "According.to.this.criterion..what.is.your.estimate.of.the.general.reproducibility.rate.of.published.findings.in.the.biomedical.sciences...in.a.scale.of.0.100..",
  Theoretical_knowledge_technique = "How.would.you.rate.your.theoretical.knowledge.of.the.technique.",
  Practical_knowledge_technique = "How.much.practical.experience.do.you.have.with.the.technique.",
  Statistical_knowledge = "How.would.you.rate.your.knowledge.on.basic.statistics.and.research.methodology"
)

joined_surveys %>% write.csv2(file.path(output.dir, "parsed_survey_data.csv"), row.names = FALSE)

